from sklearn.feature_extraction.text import CountVectorizer
import pickle
import sys


def data_iter(in_file):
    """Lazily yield each line of *in_file* with surrounding whitespace removed.

    The file is opened in text mode with the platform default encoding and
    stays open until the generator is exhausted or closed. Blank lines are
    yielded as empty strings, not skipped.
    """
    with open(in_file) as handle:
        yield from (raw_line.strip() for raw_line in handle)


if __name__ == '__main__':
    # Command line: <script> DATA_FILE VECTORIZER_FILE
    #   DATA_FILE        text file, one document per line
    #   VECTORIZER_FILE  path the fitted CountVectorizer is pickled to
    if len(sys.argv) < 3:
        sys.exit('usage: {} DATA_FILE VECTORIZER_FILE'.format(sys.argv[0]))
    data_file = sys.argv[1]
    v_file = sys.argv[2]

    # `\w+` rather than `\w*`: the starred form also matches the empty string
    # at word boundaries, which adds a spurious '' token to the vocabulary.
    # One-character tokens are still kept.
    CV = CountVectorizer(token_pattern=r'\b\w+\b', min_df=1)

    # Only the fitted vocabulary is persisted, so the document-term matrix
    # is not kept around.
    CV.fit(data_iter(data_file))

    # Write-only binary mode is sufficient; nothing is read back.
    with open(v_file, 'wb') as out_fd:
        pickle.dump(CV, out_fd)
